{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "from collections import Counter, defaultdict\n",
    "import os\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loading data #"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pos_thresh = 7\n",
    "neg_thresh = 3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The next line references the directory \"train-imbd\". You can get this data [here](https://github.com/jacobeisenstein/gt-nlp-class/releases/download/imbd-fall-2015/imbd-data.tgz)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "x = [] # list of counters\n",
    "y = [] # list of ints\n",
    "for filename in glob('train-imdb/*.txt'):\n",
    "    basename = os.path.basename(filename)\n",
    "    score = int(basename.split('_')[1].split('.')[0])\n",
    "    if score >= pos_thresh or score <= neg_thresh:\n",
    "        with open(filename) as fin:\n",
    "            counts = Counter()\n",
    "            for line in fin:\n",
    "                try:\n",
    "                    for sent in sent_tokenize(line):\n",
    "                        for word in word_tokenize(sent):\n",
    "                            counts[word.lower()] += 1 #try it without downcasing!\n",
    "                except:\n",
    "                    pass\n",
    "            if len(counts)> 0:\n",
    "                x.append(counts)\n",
    "                if score >= pos_thresh:\n",
    "                    y.append(1)\n",
    "                else: y.append(-1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Making predictions #\n",
    "\n",
    "The following function returns the total count of all feats specified in the \"feats\" list."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "score_doc = lambda counts, feats : sum([counts[feat] for feat in feats])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "score_doc(x[10],['excellent','striking','brilliant'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# make a prediction, using two word lists\n",
    "def makePreds(counts,pos_words,neg_words):\n",
    "    preds = []\n",
    "    for doc in x:\n",
    "        if score_doc(doc,pos_words) > score_doc(doc,neg_words):\n",
    "            preds.append(1)\n",
    "        elif score_doc(doc,pos_words) < score_doc(doc,neg_words):\n",
    "            preds.append(-1)\n",
    "        else: preds.append(0) # don't know!\n",
    "    return preds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "preds = makePreds(x,['excellent','striking','brilliant'],['boring','bad','terrible'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy 0.287167891464\n"
     ]
    }
   ],
   "source": [
    "print 'accuracy', sum([(pred==label) for pred,label in zip(preds,y)])/float(len(y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "no prediction 0.624646693047\n"
     ]
    }
   ],
   "source": [
    "print 'no prediction', sum([pred==0 for pred in preds]) / float(len(preds))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Precision and recall #"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# true positives: correctly predicted to be positive\n",
    "true_pos = sum([(pred==label and pred==1) for pred,label in zip(preds,y)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "pred_pos = sum([pred==1 for pred in preds])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "precision(+) 0.9\n"
     ]
    }
   ],
   "source": [
    "print 'precision(+)', true_pos / float(pred_pos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "tot_pos = sum([label==1 for label in y])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "recall(+) 0.154858299595\n"
     ]
    }
   ],
   "source": [
    "print 'recall(+)', true_pos/float(tot_pos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "true_neg = sum([(pred==label and pred==-1) for pred,label in zip(preds,y)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pred_neg = sum([pred==-1 for pred in preds])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "precision(-) 0.718623481781\n"
     ]
    }
   ],
   "source": [
    "print 'precision(-)', true_neg / float(pred_neg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tot_neg = sum([label==-1 for label in y])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "recall(-) 0.454545454545\n"
     ]
    }
   ],
   "source": [
    "print 'recall(-)', true_neg/float(tot_neg)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
